import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import numpy as np
import pandas as pd


def main():
    global beg_year, end_year, patenting_var, sectors
    beg_year, end_year = 2003, 2017
    sectors = ['333', '334', '335', '336', '511', '541']
    # Define relevant columns only
    patent_count_vars = ['Count' + ss for ss in ['', '_ICE', '_EffICE', '_CleanCar', '_Bat', '_FC']]
    patent_stock_vars = ['Stock' + ss for ss in ['', '_ICE', '_EffICE', '_CleanCar', '_Bat', '_FC']]
    patenting_var = patent_count_vars + patent_stock_vars
    # Import datasets
    global oems, df_supp_patcount, naics, links
    oems, df_supp_patcount, naics, links = import_datasets()
    df_oem_patcount_via_suppliers = []
    for oem_id in oems['OEM_Level1_ID'].unique():
        print(oem_id)
        for year in np.arange(beg_year, end_year):
            supp_patenting = patentingcounts_of_active_and_new_suppliers(oem_id, year)
            df_oem_patcount_via_suppliers.append(supp_patenting)
    df_oem_patcount_via_suppliers = pd.concat(df_oem_patcount_via_suppliers)
    df_oem_patcount_via_suppliers = df_oem_patcount_via_suppliers.reset_index()
    col_order = ['OEM_Level1_ID', 'earliest_filing_year']
    col_order.extend([k for k in df_oem_patcount_via_suppliers.columns if k not in col_order])
    df_oem_patcount_via_suppliers = df_oem_patcount_via_suppliers[col_order]
    df_oem_patcount_via_suppliers.to_csv(PATHS.dataoutput / 'C_PatentVariables/Panel_OEMS_supplierspillovers.csv')


def import_datasets():
    oems = pd.read_csv(PATHS.dataoutput / 'C_PatentVariables/Panel_OEMs_level1_with_subsi_FamInfo.csv')
    maskYears = (oems['earliest_filing_year'] >= beg_year) & (oems['earliest_filing_year'] < end_year)
    oems = oems[maskYears][['OEM_Level1_ID', 'earliest_filing_year']]
    # Supplier data
    df_supp_patcount = pd.read_csv(PATHS.dataoutput / 'C_PatentVariables/Panel_suppliers_FamInfo.csv')
    naics = pd.read_csv(PATHS.dataoutput / 'B_FactsetVariables/suppliers_naics.csv')
    naics = naics.rename(columns={'BvD ID number': 'bvdid'})
    naics['3digitNAICS'] = naics['NAICS, Primary code(s)'].astype(str).str[:3]
    df_supp_patcount = df_supp_patcount.merge(naics, on='bvdid', how='left')
    # missing naics for 318 out of 27633 suppliers
    links = pd.read_csv(PATHS.dataoutput / 'B_FactsetVariables/SC_ProdSuppPanel.csv')
    links = links.rename(columns={'SUPPLIER_ID': 'fctid', 'supplier_fctname': 'fctname'})
    supplierids = pd.read_csv(PATHS.dataoutput / 'B_FactsetVariables/suppliers_ids.csv')
    supplierids = supplierids[['fctid', 'fctname', 'bvdid']]
    # needed to link between the fctids of links and the bvdids of suppliers.
    links = links.merge(supplierids, on=['fctid', 'fctname'])
    return oems, df_supp_patcount, naics, links


def patentingcounts_of_active_and_new_suppliers(oem_id, year):
    # This function calculates the patent counts and stocks that the oem has "access" to via its suppliers
    # First we consider active suppliers only, then the new suppliers.
    act_supp_patenting = get_counts_active_suppliers(oem_id, year)
    new_supp_patenting, list_new_suppliers = get_counts_new_suppliers(oem_id, year)
    # Combining results for active and new suppliers
    supp_patenting = pd.concat([act_supp_patenting, new_supp_patenting], axis=1).reset_index()
    supp_patenting.columns = ['pat_var', 'ActSupp', 'NewSupp']
    supp_patenting['OEM_Level1_ID'] = oem_id
    supp_patenting = supp_patenting.pivot(index='OEM_Level1_ID', columns='pat_var', values=['ActSupp', 'NewSupp'])
    supp_patenting.columns = ['_'.join(column).strip() for column in supp_patenting.columns.values]
    supp_patenting['earliest_filing_year'] = year
    # count suppliers by type
    new_suppliers_naics = count_the_nbr_of_suppliers_by_naics(oem_id, list_new_suppliers)
    supp_patenting = supp_patenting.merge(new_suppliers_naics, on='OEM_Level1_ID')
    # get patenting counts by naics sector
    sectoral_suppliers_patents = patentingcounts_of_suppliers_by_NAICS_3digit_sector(oem_id, year)
    supp_patenting = supp_patenting.merge(sectoral_suppliers_patents, on='OEM_Level1_ID', how='left')
    return supp_patenting


def get_counts_active_suppliers(oem_id, year):
    # Active suppliers
    mask_activelinks = (links['OEM_Level1_ID'] == oem_id) & (links['Year'] == year) & (links['Active'] == True)
    list_active_suppliers = links[mask_activelinks]['bvdid'].tolist()
    # not all active suppliers patent -- for robustness, we should get also patents of owners and subsidiaries
    mask_actsupp = df_supp_patcount['bvdid'].isin(list_active_suppliers) & (df_supp_patcount['earliest_filing_year'] == year)
    act_supp_patenting = df_supp_patcount[mask_actsupp]
    act_supp_patenting = act_supp_patenting[patenting_var].sum()
    return act_supp_patenting


def get_counts_new_suppliers(oem_id, year):
    mask_firstime_supplier = (links['OEM_Level1_ID'] == oem_id) & (links['Year'] == year) & (links['Years_since_firstactive'] == 0)
    list_new_suppliers = links[mask_firstime_supplier]['bvdid'].tolist()
    mask_newsupplier = (df_supp_patcount['bvdid'].isin(list_new_suppliers)) & (df_supp_patcount['earliest_filing_year'] == year)
    new_supp_patenting = df_supp_patcount[mask_newsupplier]
    new_supp_patenting = new_supp_patenting[patenting_var].sum()
    return new_supp_patenting, list_new_suppliers


def count_the_nbr_of_suppliers_by_naics(oem_id, list_new_suppliers):
    # Add number of new suppliers by sector
    default_df = pd.DataFrame({'3digitNAICS': sectors})
    mask_supp_in_sectors = naics['bvdid'].isin(list_new_suppliers) & naics['3digitNAICS'].isin(sectors)
    new_suppliers_naics = naics[mask_supp_in_sectors].groupby('3digitNAICS')['bvdid'].count().reset_index().rename(columns={'bvdid': 'nb_new_suppliers'})
    new_suppliers_naics = new_suppliers_naics.merge(default_df, on='3digitNAICS', how='outer').fillna(0).reset_index()
    new_suppliers_naics['OEM_Level1_ID'] = oem_id
    new_suppliers_naics = new_suppliers_naics.pivot(index='OEM_Level1_ID', columns='3digitNAICS', values=['nb_new_suppliers'])
    new_suppliers_naics.columns = ['_'.join(column).strip() for column in new_suppliers_naics.columns.values]
    new_suppliers_naics['nb_new_suppliers_total'] = len(list_new_suppliers)
    return new_suppliers_naics


def patentingcounts_of_suppliers_by_NAICS_3digit_sector(oem_id, year):
    all_cols = [f'{i}_{j}' for i in patenting_var for j in sectors]
    mask_hasbeen_a_supplier_atsomepoint = (links['OEM_Level1_ID'] == oem_id) & (links['Year'] == year) & (links['Years_since_firstactive'] >= 0)
    list_past_present_suppliers = links[mask_hasbeen_a_supplier_atsomepoint]['bvdid'].tolist()
    mask_pastpresent_supp = df_supp_patcount['bvdid'].isin(list_past_present_suppliers) & (df_supp_patcount['earliest_filing_year'] == year) & df_supp_patcount['3digitNAICS'].isin(sectors)
    pastpresent_supp_patenting = df_supp_patcount[mask_pastpresent_supp]
    if pastpresent_supp_patenting.shape[0] == 0:
        nulldf = pd.DataFrame(0, index=[oem_id], columns=all_cols)
        nulldf['OEM_Level1_ID'] = oem_id
        nulldf.index = nulldf['OEM_Level1_ID']
        nulldf = nulldf.drop(columns=['OEM_Level1_ID'])
        return nulldf
    sectoral_suppliers_patents = pastpresent_supp_patenting.groupby('3digitNAICS')[patenting_var].sum().reset_index()
    sectoral_suppliers_patents['OEM_Level1_ID'] = oem_id
    sectoral_suppliers_patents = pd.pivot(sectoral_suppliers_patents, index='OEM_Level1_ID', columns='3digitNAICS', values=patenting_var)
    sectoral_suppliers_patents.columns = ['_'.join(column).strip() for column in sectoral_suppliers_patents.columns.values]
    for col in all_cols:
        if col not in sectoral_suppliers_patents.columns:
            sectoral_suppliers_patents[col] = 0
    return sectoral_suppliers_patents


